Importation des bibliothèques
import numpy as np
import pandas as pd
import sqlite3 as sql
from sqlite3 import Error
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.cluster import KMeans
Création de la fonction de connexion à la base de données
def create_sql_connection(path):
conn = None
try:
conn = sql.connect(path)
print(sql.version)
except Error as e:
print(e)
finally:
return conn
db_connection = create_sql_connection('E-COMMERCE.sqlite')
db_cursor = db_connection.cursor()
2.6.0
Importation de la table 'Customer'
db_cursor.execute('''DROP TABLE customer''')
db_cursor.execute('''CREATE TABLE customer (customer_id text, customer_unique_id text, customer_zip_code_prefix int,
customer_city text, customer_state text)''')
<sqlite3.Cursor at 0x127deb8f0>
customers = pd.read_csv('data/customer.csv')
customers = customers[customers['customer_city'].notna()]
customers.to_sql('customer', db_connection, if_exists='replace', index = False)
test = pd.read_sql('SELECT * FROM customer', db_connection)
test.head()
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409.0 | franca | SP |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790.0 | sao bernardo do campo | SP |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151.0 | sao paulo | SP |
| 3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775.0 | mogi das cruzes | SP |
| 4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | 13056.0 | campinas | SP |
Importation de la table 'Orders'
'''
order_id,"customer_id","order_status","order_purchase_timestamp","order_approved_at",
"order_delivered_carrier_date","order_delivered_customer_date","order_estimated_delivery_date"
'''
db_cursor.execute('''DROP TABLE orders''')
db_cursor.execute('''CREATE TABLE orders (order_id text, customer_id text, order_status text,
order_purchase_timestamp date, order_approved_at date, order_delivered_carrier_date date,
order_delivered_customer_date date, order_estimated_delivery_date date)''')
<sqlite3.Cursor at 0x127deb8f0>
orders = pd.read_csv('data/orders.csv')
orders.to_sql('orders', db_connection, if_exists='replace', index = False)
test = pd.read_sql('SELECT * FROM orders', db_connection)
test.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 00:00:00 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 00:00:00 |
Importation de la table 'Items'
'''
order_id,"order_item_id","product_id","seller_id","shipping_limit_date","price","freight_value"
'''
db_cursor.execute('''DROP TABLE items''')
db_cursor.execute('''CREATE TABLE items (order_id text, product_id text,
seller_id text, shipping_limit_date date, price real,
freight_value real)''')
<sqlite3.Cursor at 0x127deb8f0>
items = pd.read_csv('data/items.csv')
items = items.drop("order_item_id", axis=1)
items = items.drop_duplicates()
items.to_sql('items', db_connection, if_exists='replace', index = False)
test = pd.read_sql('SELECT * FROM items', db_connection)
test.head()
| order_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 |
Importation de la table 'Products'
'''
product_id,product_category_name,product_name_lenght,product_description_lenght,
product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,
product_category_name_english
'''
db_cursor.execute('''DROP TABLE products''')
db_cursor.execute('''CREATE TABLE products (product_id text, product_category_name text, product_name_lenght int,
product_description_lenght int, product_photos_qty int, product_weight_g int,
product_length_cm int, product_height_cm int, product_width_cm int, product_category_name_english)''')
<sqlite3.Cursor at 0x127deb8f0>
products = pd.read_csv('data/products.csv')
products.to_sql('products', db_connection, if_exists='replace', index = False)
test = pd.read_sql('SELECT * FROM products', db_connection)
test.head()
| product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | product_category_name_english | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumery | 40.0 | 287.0 | 1.0 | 225.0 | 16.0 | 10.0 | 14.0 | perfumery |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | art | 44.0 | 276.0 | 1.0 | 1000.0 | 30.0 | 18.0 | 20.0 | art |
| 2 | 96bd76ec8810374ed1b65e291975717f | sports_leisure | 46.0 | 250.0 | 1.0 | 154.0 | 18.0 | 9.0 | 15.0 | sports_leisure |
| 3 | cef67bcfe19066a932b7673e239eb23d | baby | 27.0 | 261.0 | 1.0 | 371.0 | 26.0 | 4.0 | 26.0 | baby |
| 4 | 9dc1a7de274444849c219cff195d0b71 | housewares | 37.0 | 402.0 | 4.0 | 625.0 | 20.0 | 17.0 | 13.0 | housewares |
Create a customer vision that centralizes sales by customers and identify top customers.
q2 = pd.read_sql('''SELECT customer.customer_unique_id As Customer, customer_zip_code_prefix as "Zip Code", customer_city as City, sum(price) as Sales
FROM customer
INNER JOIN orders on orders.customer_id = customer.customer_id
INNER JOIN items on items.order_id = orders.order_id
WHERE customer_unique_id IS NOT NULL
GROUP BY Customer, "Zip Code", City
ORDER BY Sales DESC
LIMIT 10''', db_connection)
q2
| Customer | Zip Code | City | Sales | |
|---|---|---|---|---|
| 0 | da122df9eeddfedc1dc1f5349a1a690c | 28970.0 | araruama | 7388.00 |
| 1 | dc4802a71eae9be1dd28f5d788ceb526 | 79042.0 | campo grande | 6735.00 |
| 2 | 459bef486812aa25204be022145caa62 | 29066.0 | vitoria | 6729.00 |
| 3 | ff4159b92c40ebe40454e3e6a7c35ed6 | 17506.0 | marilia | 6499.00 |
| 4 | eebb5dda148d3893cdaf5b5ca3040ccb | 9341.0 | maua | 4690.00 |
| 5 | 48e1ac109decbb87765a3eade6854098 | 58038.0 | joao pessoa | 4590.00 |
| 6 | a229eba70ec1c2abef51f04987deb7a5 | 24020.0 | niteroi | 4400.00 |
| 7 | edde2314c6c30e864a128ac95d6b2112 | 2995.0 | sao paulo | 4399.87 |
| 8 | fa562ef24d41361e476e748681810e1e | 35340.0 | bom jesus do galho | 4099.99 |
| 9 | ca27f3dac28fb1063faddd424c9d95fa | 34018.0 | nova lima | 4059.00 |
Vérification du nombre de ventes pour le premier client de la liste
q2 = pd.read_sql('''
SELECT customer.customer_unique_id As Customer, items.price as Sales
FROM customer
INNER JOIN orders on orders.customer_id = customer.customer_id
INNER JOIN items on items.order_id = orders.order_id
WHERE customer.customer_unique_id = 'da122df9eeddfedc1dc1f5349a1a690c'
AND customer_unique_id IS NOT NULL
''', db_connection)
q2
| Customer | Sales | |
|---|---|---|
| 0 | da122df9eeddfedc1dc1f5349a1a690c | 4799.0 |
| 1 | da122df9eeddfedc1dc1f5349a1a690c | 2589.0 |
How many customers are repeaters ?
q3 = pd.read_sql('''SELECT customer.customer_unique_id As Customer, COUNT(orders.order_id) as Orders
FROM customer
INNER JOIN orders on orders.customer_id = customer.customer_id
INNER JOIN items on orders.order_id = items.order_id
WHERE customer.customer_id IS NOT NULL
GROUP BY Customer
ORDER BY Orders DESC
LIMIT 10''', db_connection)
q3
| Customer | Orders | |
|---|---|---|
| 0 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 16 |
| 1 | 3e43e6105506432c953e165fb2acf44c | 13 |
| 2 | ca77025e7201e3b30c44b472ff346268 | 9 |
| 3 | 1b6c7548a2a1f9037c1fd3ddfed95f33 | 9 |
| 4 | c8ed31310fc440a3f8031b177f9842c3 | 8 |
| 5 | 6469f99c1f9dfae7733b25662e7f1782 | 8 |
| 6 | d97b3cfb22b0d6b25ac9ed4e9c2d481b | 7 |
| 7 | bf869f6a89c8ba217f47e22359f884f2 | 7 |
| 8 | b09f1bff1aff13c997af071602bc048b | 7 |
| 9 | a40096fc0a3862e9e12bc55b5f8e6ab2 | 7 |
Vérification du nombre de commandes du premier clients de la liste
q3 = pd.read_sql('''SELECT *
FROM customer
INNER JOIN orders on orders.customer_id = customer.customer_id
INNER JOIN items on orders.order_id = items.order_id
WHERE customer.customer_id IS NOT NULL AND customer.customer_unique_id = '8d50f5eadf50201ccdcedfb9e2ac8455' ''', db_connection)
q3.head(20)
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1bd3585471932167ab72a84955ebefea | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | b850a16d8faf65a74c51287ef34379ce | 1bd3585471932167ab72a84955ebefea | delivered | 2017-11-22 20:01:53 | 2017-11-22 20:12:32 | 2017-11-24 16:07:56 | 2017-11-27 18:49:13 | 2017-12-04 00:00:00 | b850a16d8faf65a74c51287ef34379ce | 5bccc8fe44471df6c7b7e08176837d2f | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-11-28 20:12:32 | 13.99 | 7.78 |
| 1 | a8fabc805e9a10a3c93ae5bff642b86b | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 23427a6bd9f8fd1b51f1b1e5cc186ab8 | a8fabc805e9a10a3c93ae5bff642b86b | delivered | 2018-05-21 22:44:31 | 2018-05-22 01:53:35 | 2018-05-22 14:18:00 | 2018-05-23 15:33:09 | 2018-05-29 00:00:00 | 23427a6bd9f8fd1b51f1b1e5cc186ab8 | 5cb96c51c55f57503465e4d2558dc053 | db4350fd57ae30082dec7acbaacc17f9 | 2018-05-24 01:53:35 | 45.99 | 7.39 |
| 2 | 897b7f72042714efaa64ac306ba0cafc | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | c2213109a2cc0e75d55585b7aaac6d97 | 897b7f72042714efaa64ac306ba0cafc | delivered | 2018-08-07 23:32:14 | 2018-08-07 23:45:21 | 2018-08-09 13:35:00 | 2018-08-10 20:26:44 | 2018-08-13 00:00:00 | c2213109a2cc0e75d55585b7aaac6d97 | de7df82eb1e48c456b020c2cfb03aeed | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2018-08-12 23:45:21 | 26.99 | 7.45 |
| 3 | b2b13de0770e06de50080fea77c459e6 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 369634708db140c5d2c4e365882c443a | b2b13de0770e06de50080fea77c459e6 | delivered | 2017-06-18 22:56:48 | 2017-06-18 23:10:19 | 2017-06-19 20:12:26 | 2017-06-23 12:55:50 | 2017-07-07 00:00:00 | 369634708db140c5d2c4e365882c443a | d83509907a19c72e1e4cdde78b8177ec | 94e93ce877be27a515118dbfd2c2be41 | 2017-06-22 23:10:19 | 39.90 | 11.85 |
| 4 | 42dbc1ad9d560637c9c4c1533746f86d | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | bf92c69b7cc70f7fc2c37de43e366173 | 42dbc1ad9d560637c9c4c1533746f86d | delivered | 2017-07-24 22:11:50 | 2017-07-24 22:25:14 | 2017-07-26 01:42:03 | 2017-07-31 16:59:58 | 2017-08-15 00:00:00 | bf92c69b7cc70f7fc2c37de43e366173 | e01bc2ad5c6f46c0f53f73379fcd602e | a3a38f4affed601eb87a97788c949667 | 2017-07-30 22:25:14 | 149.90 | 16.81 |
| 5 | dfb941d6f7b02f57a44c3b7c3fefb44b | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 4f62d593acae92cea3c5662c76122478 | dfb941d6f7b02f57a44c3b7c3fefb44b | delivered | 2017-07-18 23:10:58 | 2017-07-18 23:23:26 | 2017-07-20 19:00:02 | 2017-07-21 16:19:40 | 2017-07-31 00:00:00 | 4f62d593acae92cea3c5662c76122478 | 94cc774056d3f2b0dc693486a589025e | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-07-24 23:23:26 | 13.99 | 7.78 |
| 6 | 65f9db9dd07a4e79b625effa4c868fcb | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 112eb6f37f1b9dabbced368fbbc6c9ef | 65f9db9dd07a4e79b625effa4c868fcb | delivered | 2018-07-23 21:53:02 | 2018-07-24 10:31:34 | 2018-07-25 10:25:00 | 2018-07-26 18:29:28 | 2018-08-02 00:00:00 | 112eb6f37f1b9dabbced368fbbc6c9ef | 41f6cb7c3b1200749326e50106f32d58 | db4350fd57ae30082dec7acbaacc17f9 | 2018-07-30 09:43:43 | 99.00 | 8.85 |
| 7 | 1c62b48fb34ee043310dcb233caabd2e | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 519203404f6116d406a970763ee75799 | 1c62b48fb34ee043310dcb233caabd2e | delivered | 2017-08-05 08:59:43 | 2017-08-05 09:10:13 | 2017-08-07 18:50:00 | 2017-08-09 15:22:28 | 2017-08-25 00:00:00 | 519203404f6116d406a970763ee75799 | 5fb61f482620cb672f5e586bb132eae9 | 94e93ce877be27a515118dbfd2c2be41 | 2017-08-10 09:10:13 | 69.90 | 11.99 |
| 8 | a682769c4bc10fc6ef2101337a6c83c9 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | d3582fd5ccccd9cb229a63dfb417c86f | a682769c4bc10fc6ef2101337a6c83c9 | delivered | 2018-08-20 19:14:26 | 2018-08-20 19:30:05 | 2018-08-21 15:11:00 | 2018-08-24 14:08:43 | 2018-09-04 00:00:00 | d3582fd5ccccd9cb229a63dfb417c86f | df473738565b52f77b4e22b328b41576 | 01266d4c46afa519678d16a8b683d325 | 2018-08-22 19:30:05 | 23.40 | 15.26 |
| 9 | 6289b75219d757a56c0cce8d9e427900 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 6bdf325f0966e3056651285c0aed5aad | 6289b75219d757a56c0cce8d9e427900 | delivered | 2018-05-22 23:08:55 | 2018-05-22 23:36:01 | 2018-05-23 19:02:00 | 2018-05-24 11:58:23 | 2018-05-30 00:00:00 | 6bdf325f0966e3056651285c0aed5aad | d6354128c28cc56532ba7393d9373083 | 412a4720f3e9431b4afa1476a1acddbe | 2018-05-24 23:31:13 | 51.80 | 11.15 |
| 10 | 3414a9c813e3ca02504b8be8b2deb27f | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | a1a6eadae352582010f449d8527b55f3 | 3414a9c813e3ca02504b8be8b2deb27f | delivered | 2018-08-18 12:39:19 | 2018-08-18 12:50:37 | 2018-08-20 14:43:00 | 2018-08-24 17:26:42 | 2018-09-03 00:00:00 | a1a6eadae352582010f449d8527b55f3 | 426f910ccd39ae5e7d55013c466fe343 | 0691148aee60ca47977c187804f935ae | 2018-08-23 12:50:37 | 72.90 | 23.50 |
| 11 | 0e4fdc084a6b9329ed55d62dcd653ccf | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 5d848f3d93a493c1c8955e018240e7ca | 0e4fdc084a6b9329ed55d62dcd653ccf | shipped | 2017-05-15 23:30:03 | 2017-05-15 23:42:34 | 2017-05-17 10:42:20 | None | 2017-05-26 00:00:00 | 5d848f3d93a493c1c8955e018240e7ca | 4e1d2ef2974c85d82582edfe594a4f57 | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-05-21 23:42:34 | 14.99 | 7.78 |
| 12 | f5188d99e9281e214a4a7d1b139a8229 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 838f8e96cf584a1d8d22eaf7143c7772 | f5188d99e9281e214a4a7d1b139a8229 | delivered | 2018-07-04 22:35:13 | 2018-07-05 16:27:55 | 2018-07-06 10:15:00 | 2018-07-10 21:27:30 | 2018-07-17 00:00:00 | 838f8e96cf584a1d8d22eaf7143c7772 | fe7e33eaa09b5a4a3d3aa477280db875 | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2018-07-08 23:31:20 | 22.99 | 7.42 |
| 13 | 89be66634d68fa73a95499b6352e085d | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | cd4b336a02aacabd0ef22f6db711f95e | 89be66634d68fa73a95499b6352e085d | delivered | 2017-10-18 23:25:04 | 2017-10-19 00:36:08 | 2017-10-20 17:11:50 | 2017-10-23 18:33:01 | 2017-10-30 00:00:00 | cd4b336a02aacabd0ef22f6db711f95e | 9169f70b7ce1138ac35678432d1e5ff2 | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-10-25 00:36:08 | 29.99 | 7.78 |
| 14 | 0bf8bf19944a7f8b40ba86fef778ca7c | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | e3071b7624445af6e4f3a1b23718667d | 0bf8bf19944a7f8b40ba86fef778ca7c | delivered | 2017-09-05 22:14:52 | 2017-09-05 22:30:56 | 2017-09-06 15:26:12 | 2017-09-11 13:27:49 | 2017-09-22 00:00:00 | e3071b7624445af6e4f3a1b23718667d | 88159bac10e3d4f4b69b157406c7f70f | fdaaf5bfda82b7b80535610c831b8d09 | 2017-09-12 22:30:56 | 39.90 | 11.85 |
| 15 | 9a1afef458843a022e431f4cb304dfe9 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP | 89d9b111d2b990deb5f5f9769f92800b | 9a1afef458843a022e431f4cb304dfe9 | delivered | 2017-10-29 16:58:02 | 2017-10-29 17:10:09 | 2017-10-30 15:58:52 | 2017-10-31 15:33:47 | 2017-11-10 00:00:00 | 89d9b111d2b990deb5f5f9769f92800b | 94cc774056d3f2b0dc693486a589025e | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-11-03 17:10:09 | 13.99 | 7.78 |
q4 = pd.read_sql('''
SELECT products.product_category_name_english As Category, SUM(items.price) / COUNT(orders.customer_id) As Price
FROM products
INNER JOIN items on items.product_id = products.product_id
INNER JOIN orders on items.order_id = orders.order_id
GROUP BY Category
ORDER BY Price DESC
''', db_connection)
q4
| Category | Price | |
|---|---|---|
| 0 | computers | 1141.459558 |
| 1 | small_appliances_home_oven_and_coffee | 624.285658 |
| 2 | home_appliances_2 | 483.264786 |
| 3 | agro_industry_and_commerce | 351.167650 |
| 4 | musical_instruments | 293.112673 |
| ... | ... | ... |
| 67 | drinks | 57.019371 |
| 68 | cds_dvds_musicals | 53.333333 |
| 69 | diapers_and_hygiene | 46.466296 |
| 70 | flowers | 33.204828 |
| 71 | home_comfort_2 | 28.452917 |
72 rows × 2 columns
Deuxième façon de répondre à cette question en utilisant la fonction "Average" en SQL
q4 = pd.read_sql('''
SELECT products.product_category_name_english As Category, AVG(items.price) As Price
FROM products
INNER JOIN items on items.product_id = products.product_id
INNER JOIN orders on items.order_id = orders.order_id
GROUP BY Category
ORDER BY Price DESC
''', db_connection)
q4
| Category | Price | |
|---|---|---|
| 0 | computers | 1141.459558 |
| 1 | small_appliances_home_oven_and_coffee | 624.285658 |
| 2 | home_appliances_2 | 483.264786 |
| 3 | agro_industry_and_commerce | 351.167650 |
| 4 | musical_instruments | 293.112673 |
| ... | ... | ... |
| 67 | drinks | 57.019371 |
| 68 | cds_dvds_musicals | 53.333333 |
| 69 | diapers_and_hygiene | 46.466296 |
| 70 | flowers | 33.204828 |
| 71 | home_comfort_2 | 28.452917 |
72 rows × 2 columns
Vérification des prix pour la catégorie fashion_childrens_clothes
q4 = pd.read_sql('''
SELECT products.product_category_name_english As Category, orders.customer_id as Order_count, items.price
FROM products
INNER JOIN items on items.product_id = products.product_id
INNER JOIN orders on items.order_id = orders.order_id
WHERE Category = 'fashion_childrens_clothes'
''', db_connection)
q4
| Category | Order_count | price | |
|---|---|---|---|
| 0 | fashion_childrens_clothes | f4b4417d306cbf8c55ea65c89d0261f7 | 89.99 |
| 1 | fashion_childrens_clothes | 035c08a68b237ef4afc82109e61a7380 | 89.99 |
| 2 | fashion_childrens_clothes | 14acb30bbc064408dcb8cdd3c92b0c97 | 110.00 |
| 3 | fashion_childrens_clothes | 03eaeeab9883ef63043d7f0e0325a370 | 49.90 |
| 4 | fashion_childrens_clothes | 2415312e36eac3ab3819f5b7f32522e5 | 110.00 |
| 5 | fashion_childrens_clothes | 85485f38b20ed1e287120d30cf39ee7d | 39.99 |
| 6 | fashion_childrens_clothes | 8325d6e7af2f2bfb58cf040729b232dd | 39.99 |
| 7 | fashion_childrens_clothes | 7968f8e8d5429c6a0611671f5a52ed2b | 39.99 |
q4 = pd.read_sql('''
SELECT products.product_category_name_english As Category, AVG(items.price) As Price
FROM products
INNER JOIN items on items.product_id = products.product_id
INNER JOIN orders on items.order_id = orders.order_id
WHERE products.product_category_name_english = 'fashion_childrens_clothes'
GROUP BY Category
ORDER BY Price DESC
''', db_connection)
q4
| Category | Price | |
|---|---|---|
| 0 | fashion_childrens_clothes | 71.23125 |
What are the most popular products?
q5 = pd.read_sql('''
SELECT products.product_id As Product, COUNT(orders.order_id) As Orders
FROM products
INNER JOIN items on items.product_id = products.product_id
INNER JOIN orders on items.order_id = orders.order_id
GROUP BY Product
ORDER BY Orders DESC
LIMIT 10
''', db_connection)
q5
| Product | Orders | |
|---|---|---|
| 0 | 99a4788cb24856965c36a24e339b6058 | 467 |
| 1 | aca2eb7d00ea1a7b8ebd4e68314663af | 431 |
| 2 | 422879e10f46682990de24d770e7f83d | 352 |
| 3 | d1c427060a0f73f6b889a5c7c61f2ac4 | 323 |
| 4 | 389d119b48cf3043d311335e499d9c6b | 311 |
| 5 | 53b36df67ebb7c41585e8d54d6772e08 | 306 |
| 6 | 368c6c730842d78016ad823897a372db | 291 |
| 7 | 53759a2ecddad2bb87a079a1f1519f73 | 287 |
| 8 | 154e7e31ebfa092203795c972e5804a6 | 269 |
| 9 | 2b4609f8948be18874494203496bc318 | 259 |
Vérification du nombre de commandes pour le produit sélectionné
q5 = pd.read_sql('''
SELECT *
FROM items
WHERE product_id = 'aca2eb7d00ea1a7b8ebd4e68314663af'
ORDER BY shipping_limit_date DESC
''', db_connection)
q5.count()
order_id 431 product_id 431 seller_id 431 shipping_limit_date 431 price 431 freight_value 431 dtype: int64
q5 = pd.read_sql('''
SELECT products.product_id As Product, COUNT(orders.order_id) As Orders
FROM products
INNER JOIN items on items.product_id = products.product_id
INNER JOIN orders on items.order_id = orders.order_id
WHERE products.product_id = 'aca2eb7d00ea1a7b8ebd4e68314663af'
GROUP BY Product
ORDER BY Orders DESC
LIMIT 10
''', db_connection)
q5
| Product | Orders | |
|---|---|---|
| 0 | aca2eb7d00ea1a7b8ebd4e68314663af | 431 |
What are repeat customers mostly buying ?
Création de la vue des clients récurrents
db_cursor.execute('''DROP VIEW v_repeat_customers''')
db_cursor.execute('''
CREATE VIEW v_repeat_customers AS
SELECT customer.customer_unique_id As Customer, COUNT(orders.order_id) as Orders
FROM customer
INNER JOIN orders on orders.customer_id = customer.customer_id
INNER JOIN items on items.order_id = orders.order_id
WHERE customer.customer_id IS NOT NULL
GROUP BY Customer
ORDER BY Orders DESC
''')
<sqlite3.Cursor at 0x127deb8f0>
Réponse à la question :
q6 = pd.read_sql('''
SELECT products.product_id As Product, COUNT(orders.order_id) As Orders
FROM products
INNER JOIN items on items.product_id = products.product_id
INNER JOIN orders on items.order_id = orders.order_id
INNER JOIN customer on orders.customer_id = customer.customer_id
WHERE customer.customer_unique_id IN
(SELECT Customer
FROM v_repeat_customers
WHERE Orders >= 2)
GROUP BY Product
ORDER BY Orders DESC
''', db_connection)
q6
| Product | Orders | |
|---|---|---|
| 0 | 99a4788cb24856965c36a24e339b6058 | 79 |
| 1 | 36f60d45225e60c7da4558b070ce4b60 | 76 |
| 2 | e53e557d5a159f5aa2c5e995dfdf244b | 56 |
| 3 | 389d119b48cf3043d311335e499d9c6b | 54 |
| 4 | 154e7e31ebfa092203795c972e5804a6 | 53 |
| ... | ... | ... |
| 8038 | 002959d7a0b0990fe2d69988affcbc80 | 1 |
| 8039 | 00250175f79f584c14ab5cecd80553cd | 1 |
| 8040 | 00210e41887c2a8ef9f791ebc780cc36 | 1 |
| 8041 | 001b72dfd63e9833e8c02742adf472e3 | 1 |
| 8042 | 0011c512eb256aa0dbbb544d8dffcf6e | 1 |
8043 rows × 2 columns
Vérification pour le produit sélectionné qui a été commandé par le client aux 17 commandes
q6 = pd.read_sql('''
SELECT *
FROM items
WHERE product_id = '41f6cb7c3b1200749326e50106f32d58'
''', db_connection)
q6.head()
| order_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|
| 0 | 112eb6f37f1b9dabbced368fbbc6c9ef | 41f6cb7c3b1200749326e50106f32d58 | db4350fd57ae30082dec7acbaacc17f9 | 2018-07-30 09:43:43 | 99.0 | 8.85 |
q6 = pd.read_sql('''
SELECT *
FROM orders
WHERE order_id = '112eb6f37f1b9dabbced368fbbc6c9ef'
''', db_connection)
q6.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | 112eb6f37f1b9dabbced368fbbc6c9ef | 65f9db9dd07a4e79b625effa4c868fcb | delivered | 2018-07-23 21:53:02 | 2018-07-24 10:31:34 | 2018-07-25 10:25:00 | 2018-07-26 18:29:28 | 2018-08-02 00:00:00 |
q6 = pd.read_sql('''
SELECT *
FROM customer
WHERE customer_id = '65f9db9dd07a4e79b625effa4c868fcb'
''', db_connection)
q6.head()
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|
| 0 | 65f9db9dd07a4e79b625effa4c868fcb | 8d50f5eadf50201ccdcedfb9e2ac8455 | 4045.0 | sao paulo | SP |
q6 = pd.read_sql('''
SELECT *
FROM customer
WHERE customer_unique_id = '8d50f5eadf50201ccdcedfb9e2ac8455'
''', db_connection)
q6.count()
customer_id 17 customer_unique_id 17 customer_zip_code_prefix 17 customer_city 17 customer_state 17 dtype: int64
Propose a simple customer segmentation. You may use any tool you find relevant.
Création de la vue des ventes en fonction des clients
db_cursor.execute('''DROP VIEW v_customer_sales''')
db_cursor.execute('''
CREATE VIEW v_customer_sales AS
SELECT customer.customer_unique_id As Customer, sum(price) as Sales
FROM customer
INNER JOIN orders on orders.customer_id = customer.customer_id
INNER JOIN items on items.order_id = orders.order_id
WHERE customer_unique_id IS NOT NULL
GROUP BY Customer
ORDER BY Sales DESC
''')
<sqlite3.Cursor at 0x127deb8f0>
qs = pd.read_sql('''
SELECT *
FROM v_customer_sales
''', db_connection)
qs
| Customer | Sales | |
|---|---|---|
| 0 | da122df9eeddfedc1dc1f5349a1a690c | 7388.00 |
| 1 | dc4802a71eae9be1dd28f5d788ceb526 | 6735.00 |
| 2 | 459bef486812aa25204be022145caa62 | 6729.00 |
| 3 | ff4159b92c40ebe40454e3e6a7c35ed6 | 6499.00 |
| 4 | eebb5dda148d3893cdaf5b5ca3040ccb | 4690.00 |
| ... | ... | ... |
| 95415 | cf3839da0d9492ad151690b65f45d800 | 2.99 |
| 95416 | bd06ce0e06ad77a7f681f1a4960a3cc6 | 2.29 |
| 95417 | 317cfc692e3f86c45c95697c61c853a6 | 2.20 |
| 95418 | d80730c15c647bc8f2ad77c908ba5ca9 | 0.85 |
| 95419 | b38211bd797f4fdd81a98b9d1754b606 | 0.85 |
95420 rows × 2 columns
Récupération de la vue du nombre de commandes par client
qs = pd.read_sql('''
SELECT *
FROM v_repeat_customers
''', db_connection)
qs
| Customer | Orders | |
|---|---|---|
| 0 | 8d50f5eadf50201ccdcedfb9e2ac8455 | 16 |
| 1 | 3e43e6105506432c953e165fb2acf44c | 13 |
| 2 | ca77025e7201e3b30c44b472ff346268 | 9 |
| 3 | 1b6c7548a2a1f9037c1fd3ddfed95f33 | 9 |
| 4 | c8ed31310fc440a3f8031b177f9842c3 | 8 |
| ... | ... | ... |
| 95415 | 0004aac84e0df4da2b147fca70cf8255 | 1 |
| 95416 | 0000f6ccb0745a6a4b88665a16c9f078 | 1 |
| 95417 | 0000f46a3911fa3c0805444483337064 | 1 |
| 95418 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 1 |
| 95419 | 0000366f3b9a7992bf8c76cfdf3221e2 | 1 |
95420 rows × 2 columns
Vérification du nombre de commande pour le premier client de la liste
qs = pd.read_sql('''
SELECT *
FROM items
WHERE order_id IN
(SELECT order_id
FROM orders
WHERE customer_id IN
(SELECT customer_id
FROM customer
WHERE customer_unique_id = '8d50f5eadf50201ccdcedfb9e2ac8455'))
''', db_connection)
qs
| order_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|
| 0 | 112eb6f37f1b9dabbced368fbbc6c9ef | 41f6cb7c3b1200749326e50106f32d58 | db4350fd57ae30082dec7acbaacc17f9 | 2018-07-30 09:43:43 | 99.00 | 8.85 |
| 1 | 23427a6bd9f8fd1b51f1b1e5cc186ab8 | 5cb96c51c55f57503465e4d2558dc053 | db4350fd57ae30082dec7acbaacc17f9 | 2018-05-24 01:53:35 | 45.99 | 7.39 |
| 2 | 369634708db140c5d2c4e365882c443a | d83509907a19c72e1e4cdde78b8177ec | 94e93ce877be27a515118dbfd2c2be41 | 2017-06-22 23:10:19 | 39.90 | 11.85 |
| 3 | 4f62d593acae92cea3c5662c76122478 | 94cc774056d3f2b0dc693486a589025e | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-07-24 23:23:26 | 13.99 | 7.78 |
| 4 | 519203404f6116d406a970763ee75799 | 5fb61f482620cb672f5e586bb132eae9 | 94e93ce877be27a515118dbfd2c2be41 | 2017-08-10 09:10:13 | 69.90 | 11.99 |
| 5 | 5d848f3d93a493c1c8955e018240e7ca | 4e1d2ef2974c85d82582edfe594a4f57 | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-05-21 23:42:34 | 14.99 | 7.78 |
| 6 | 6bdf325f0966e3056651285c0aed5aad | d6354128c28cc56532ba7393d9373083 | 412a4720f3e9431b4afa1476a1acddbe | 2018-05-24 23:31:13 | 51.80 | 11.15 |
| 7 | 838f8e96cf584a1d8d22eaf7143c7772 | fe7e33eaa09b5a4a3d3aa477280db875 | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2018-07-08 23:31:20 | 22.99 | 7.42 |
| 8 | 89d9b111d2b990deb5f5f9769f92800b | 94cc774056d3f2b0dc693486a589025e | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-11-03 17:10:09 | 13.99 | 7.78 |
| 9 | a1a6eadae352582010f449d8527b55f3 | 426f910ccd39ae5e7d55013c466fe343 | 0691148aee60ca47977c187804f935ae | 2018-08-23 12:50:37 | 72.90 | 23.50 |
| 10 | b850a16d8faf65a74c51287ef34379ce | 5bccc8fe44471df6c7b7e08176837d2f | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-11-28 20:12:32 | 13.99 | 7.78 |
| 11 | bf92c69b7cc70f7fc2c37de43e366173 | e01bc2ad5c6f46c0f53f73379fcd602e | a3a38f4affed601eb87a97788c949667 | 2017-07-30 22:25:14 | 149.90 | 16.81 |
| 12 | c2213109a2cc0e75d55585b7aaac6d97 | de7df82eb1e48c456b020c2cfb03aeed | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2018-08-12 23:45:21 | 26.99 | 7.45 |
| 13 | cd4b336a02aacabd0ef22f6db711f95e | 9169f70b7ce1138ac35678432d1e5ff2 | 1da3aeb70d7989d1e6d9b0e887f97c23 | 2017-10-25 00:36:08 | 29.99 | 7.78 |
| 14 | d3582fd5ccccd9cb229a63dfb417c86f | df473738565b52f77b4e22b328b41576 | 01266d4c46afa519678d16a8b683d325 | 2018-08-22 19:30:05 | 23.40 | 15.26 |
| 15 | e3071b7624445af6e4f3a1b23718667d | 88159bac10e3d4f4b69b157406c7f70f | fdaaf5bfda82b7b80535610c831b8d09 | 2017-09-12 22:30:56 | 39.90 | 11.85 |
Jointure entre la vue du nombre de commandes et de ventes en fonction des clients
qs = pd.read_sql('''
SELECT v_customer_sales.Customer, v_customer_sales.Sales, v_repeat_customers.Orders
FROM v_customer_sales
INNER JOIN v_repeat_customers ON v_repeat_customers.Customer = v_customer_sales.Customer
''', db_connection)
qs
| Customer | Sales | Orders | |
|---|---|---|---|
| 0 | da122df9eeddfedc1dc1f5349a1a690c | 7388.00 | 2 |
| 1 | dc4802a71eae9be1dd28f5d788ceb526 | 6735.00 | 1 |
| 2 | 459bef486812aa25204be022145caa62 | 6729.00 | 1 |
| 3 | ff4159b92c40ebe40454e3e6a7c35ed6 | 6499.00 | 1 |
| 4 | eebb5dda148d3893cdaf5b5ca3040ccb | 4690.00 | 1 |
| ... | ... | ... | ... |
| 95415 | cf3839da0d9492ad151690b65f45d800 | 2.99 | 1 |
| 95416 | bd06ce0e06ad77a7f681f1a4960a3cc6 | 2.29 | 1 |
| 95417 | 317cfc692e3f86c45c95697c61c853a6 | 2.20 | 1 |
| 95418 | d80730c15c647bc8f2ad77c908ba5ca9 | 0.85 | 1 |
| 95419 | b38211bd797f4fdd81a98b9d1754b606 | 0.85 | 1 |
95420 rows × 3 columns
Histogramme de la colonne des ventes ainsi que de la colonne du nombre de commandes
fig = make_subplots(rows=2, cols=1,
subplot_titles=("Nombre de commandes",
"Ventes"))
fig.append_trace(go.Histogram(x=qs.Orders),
row=1, col=1)
fig.append_trace(go.Histogram(x=qs.Sales),
row=2, col=1)
fig.update_layout(height=800, width=800,
title_text="Histogramme des ventes et du nombre de commandes")
fig.show()
Création des colonnes logarithmiques des ventes et du nombre de commandes
qs["log_Sales"] = np.log1p(qs["Sales"])
qs["log_Orders"] = np.log1p(qs["Orders"])
fig = make_subplots(rows=2, cols=1,
subplot_titles=("Nombre de commandes (log)",
"Ventes (log)"))
fig.append_trace(go.Histogram(x=qs.log_Orders),
row=1, col=1)
fig.append_trace(go.Histogram(x=qs.log_Sales),
row=2, col=1)
fig.update_layout(height=800, width=800,
title_text="Histogramme des ventes et du nombre de commandes (log)")
fig.show()
Aperçu des colonnes logarithmes des ventes et du nombre de commandes
qs.iloc[:,3:]
| log_Sales | log_Orders | |
|---|---|---|
| 0 | 8.907748 | 1.098612 |
| 1 | 8.815222 | 0.693147 |
| 2 | 8.814330 | 0.693147 |
| 3 | 8.779557 | 0.693147 |
| 4 | 8.453401 | 0.693147 |
| ... | ... | ... |
| 95415 | 1.383791 | 0.693147 |
| 95416 | 1.190888 | 0.693147 |
| 95417 | 1.163151 | 0.693147 |
| 95418 | 0.615186 | 0.693147 |
| 95419 | 0.615186 | 0.693147 |
95420 rows × 2 columns
Création de la baseline du modèle de segmentation client
kmeans_model = KMeans(init='k-means++', max_iter=500, random_state=42)
kmeans_model.fit(qs.iloc[:,3:])
print("Somme des carrés des distances du premier modèle K-Means:", kmeans_model.inertia_)
def make_list_of_K(K, dataframe):
clusters = list(range(1, K+1))
inertia_values=[]
for c in clusters:
model = KMeans(n_clusters = c, init='k-means++', max_iter=500, random_state=42)
model.fit(dataframe)
inertia_values.append(model.inertia_)
return inertia_values
Somme des carrés des distances du premier modèle K-Means: 3967.504771783319
Sauvegarde de l'inertie des modèles pour un nombre de clusters allant de 1 à 15
results = make_list_of_K(15, qs.iloc[:, 3:])
k_values_distances = pd.DataFrame({
"Nombre de clusters": list(range(1, 16)),
"Somme des carrés des distances": results
})
Visualisation de la méthode du coude
fig = go.Figure()
fig.add_trace(go.Scatter(x=k_values_distances["Nombre de clusters"],
y=k_values_distances["Somme des carrés des distances"],
mode='lines+markers'))
fig.update_layout(xaxis = dict(
tickmode = 'linear',
tick0 = 1,
dtick = 1),
title_text="Somme des carrés des distances en fonction du nombre de clusters",
xaxis_title="Nombre de clusters",
yaxis_title="Somme des carrés des distances")
fig.show()
Modèle de segmentation final
final_kmeans_model = KMeans(n_clusters = 4, init='k-means++', max_iter=500, random_state=42)
final_kmeans_model.fit_predict(qs.iloc[:,3:])
array([2, 2, 2, ..., 1, 1, 1], dtype=int32)
qs["clusters"] = final_kmeans_model.labels_
qs["cluster_name"] = qs["clusters"].astype(str)
fig = px.scatter(qs,
x="log_Sales",
y="log_Orders",
color='cluster_name',
hover_data=["Sales",
"Orders",
"Customer"],
category_orders = {"cluster_name": ["0", "1", "2", "3"]},
)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()
qs
| Customer | Sales | Orders | log_Sales | log_Orders | clusters | cluster_name | |
|---|---|---|---|---|---|---|---|
| 0 | da122df9eeddfedc1dc1f5349a1a690c | 7388.00 | 2 | 8.907748 | 1.098612 | 2 | 2 |
| 1 | dc4802a71eae9be1dd28f5d788ceb526 | 6735.00 | 1 | 8.815222 | 0.693147 | 2 | 2 |
| 2 | 459bef486812aa25204be022145caa62 | 6729.00 | 1 | 8.814330 | 0.693147 | 2 | 2 |
| 3 | ff4159b92c40ebe40454e3e6a7c35ed6 | 6499.00 | 1 | 8.779557 | 0.693147 | 2 | 2 |
| 4 | eebb5dda148d3893cdaf5b5ca3040ccb | 4690.00 | 1 | 8.453401 | 0.693147 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 95415 | cf3839da0d9492ad151690b65f45d800 | 2.99 | 1 | 1.383791 | 0.693147 | 1 | 1 |
| 95416 | bd06ce0e06ad77a7f681f1a4960a3cc6 | 2.29 | 1 | 1.190888 | 0.693147 | 1 | 1 |
| 95417 | 317cfc692e3f86c45c95697c61c853a6 | 2.20 | 1 | 1.163151 | 0.693147 | 1 | 1 |
| 95418 | d80730c15c647bc8f2ad77c908ba5ca9 | 0.85 | 1 | 0.615186 | 0.693147 | 1 | 1 |
| 95419 | b38211bd797f4fdd81a98b9d1754b606 | 0.85 | 1 | 0.615186 | 0.693147 | 1 | 1 |
95420 rows × 7 columns